
# to process data
library(tidyverse)
library(lubridate)

# to manipulate text data
library(quanteda)



# load IMF conditionality data for the 74 countries of interest
load("imf_conditionality.RData")


# conditions reference several national oil companies (NOC)
# we don't care about the individual NOC as much as about the fact that it's a NOC
# so replace each mention of a NOC with the abbreviation "noc"
#
# the regular expression is assembled from a character vector so that no
# line break or indentation can leak into the pattern (a string literal that
# spans two lines keeps the newline and leading spaces as part of the regex)
# names are listed without a trailing ".": an unescaped "." matches any
# character and would swallow the character following the name (usually a
# space), gluing "noc" onto the next word so that it is no longer found later
# matching is case-sensitive, hence the separate spelling variants
full_imf_sample <- full_imf_sample %>%
  mutate(
    # hyphenate "extrabudgetary"
    text = gsub("extrabudgetary", "extra-budgetary", text),
    # collapse every NOC name (incl. the fused typo "ofNaftogas") into "noc"
    text = gsub(
      paste(
        c(
          "Sonelgaz", "Sonangol", "SOCAR", "AzeriGas", "Azerigas", "Azerigaz",
          "SONABEL", "SONABHY", "SNH", "SONARA", "PETROCA", "SNPC", "SOGARA",
          "PETROCI", "SOMAGAZ", "SONIDEP", "NNPC", "Gazprom", "Ukrgazprom",
          "OTP", "Naftogaz", "Naftogas", "ofNaftogas", "PDVSA"
        ),
        collapse = "|"
      ),
      "noc",
      text
    )
  )

# for the main analysis, only keep binding conditions
# note: there are a total of 435 agreements, but 33 of these agreements consist exclusively of non-binding conditions
# e.g. Afghanistan 2016-2019 (id 745) consists of 55 structural benchmarks (SB), which (according to Copelovitch 2010 and others) are not binding
# once we limit the sample to binding conditions, we lose these 33 agreements
# hence the total number of agreements mentioned in the paper is 402
# subset to the rows whose `binding` flag equals 1
binding <- filter(full_imf_sample, binding == 1)



# build a quanteda corpus; the condition wording lives in the "text" column
key_corpus <- binding %>%
  corpus(text_field = "text")

# tokenise, lower-case and drop English stopwords
# pre-processing choices can be arbitrary and misleading (Denny and Spirling 2018),
# so words are deliberately left unstemmed and infrequent terms are kept
data_tokens <- key_corpus %>%
  tokens(remove_punct = TRUE,
         remove_symbols = TRUE,
         remove_numbers = TRUE,
         remove_url = TRUE,
         remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(stopwords("english"))


# set search terms
# a single dictionary key ("resources") collects all natural-resource terms
# each term is listed exactly once so that no term can enter the lookup twice
# "noc" is the placeholder substituted for national oil company names
natural_resource_dictionary <- dictionary(list(
  resources = c(
    "natural", "extractive", "oil", "petroleum",
    "crude", "gas", "gasoline", "diesel", "electricity",
    "fuel", "fuels", "energy", "refinery", "hydrocarbon", "mineral",
    "mining", "mine", "copper", "gold", "diamond",
    "iron", "steel", "phosphate", "eiti", "noc"
  )
))

### absolute values

# turn the token object into a document-feature matrix (dfm)
data_dfm <- data_tokens %>%
  dfm()

# look the dictionary up in the dfm: one "resources" column per document
search_hits_abs <- dfm_lookup(data_dfm, dictionary = natural_resource_dictionary)

# convert the lookup result to a data frame and give the column a
# name that marks it as the unweighted measure
search_results_abs <- rename(
  convert(search_hits_abs, to = "data.frame"),
  resources_abs = resources
)


### weighted values
# weight the dfm by term frequency-inverse document frequency (tf-idf):
# a term's weight in a document grows with how often it occurs there,
# but is discounted by how many documents in the corpus contain the term

# re-weight the document-feature matrix built from the tokens
data_dfm_tfidf <- data_dfm %>%
  dfm_tfidf()

# look the dictionary up in the weighted dfm
search_hits_tfidf <- dfm_lookup(data_dfm_tfidf, dictionary = natural_resource_dictionary)

# convert the lookup result to a data frame and give the column a
# name that marks it as the tf-idf weighted measure
search_results_tfidf <- rename(
  convert(search_hits_tfidf, to = "data.frame"),
  resources_tfidf = resources
)


# combine
# 1) merge the unweighted and the tf-idf weighted results on their shared column(s)
results_binding_dictionary <- full_join(search_results_abs, search_results_tfidf)

# 2) attach the condition-level data column-wise; rows are matched by position
results_binding_dictionary <- cbind(results_binding_dictionary, binding)

# 3) keep the original variables (id through labor_policy) plus both measures
results_binding_dictionary <- select(
  results_binding_dictionary,
  id:labor_policy, resources_abs, resources_tfidf
)


# store the whole workspace for later use
save.image(file = "workspace_dictionary_methods.Rdata")

